##################################
# Loading R libraries
##################################
library(datasets)
library(caret)
library(moments)
library(missCompare)
library(mi)
library(missMDA)
library(pcaMethods)
library(missForest)
library(rms)
library(mice)
library(VIM)
library(misty)
library(Hmisc)
library(Amelia)
library(foreign)
library(RBtest)
######################################
# Setting the working directory
######################################
# NOTE(review): hard-coded, machine-specific working directory; the script
# only runs where this path exists. Kept unchanged so that "TBI.sav" below
# still resolves relative to it.
setwd("D:/R_Codes/Markdown/MissingDataImputation")
##################################
# Loading dataset
##################################
# Read the SPSS file into a data frame without converting labelled values to
# factors. TRUE/FALSE are spelled out because T and F are ordinary variables
# that can be reassigned.
TBI <- read.spss("TBI.sav", use.value.labels = FALSE, to.data.frame = TRUE)
##################################
# Performing a general exploration of the dataset
##################################
# Dimensions (rows, columns) of the raw dataset
dim(TBI)
## [1] 2159 24
# Per-column summary statistics of the raw dataset
summary(TBI)
## trial d.gos d.mort d.unfav
## Length:2159 Min. :1.000 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:2.000 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :4.000 Median :0.000 Median :0.0000
## Mean :3.543 Mean :0.233 Mean :0.3942
## 3rd Qu.:5.000 3rd Qu.:0.000 3rd Qu.:1.0000
## Max. :5.000 Max. :1.000 Max. :1.0000
##
## cause age d.motor d.pupil
## Min. :3.000 Min. :14.00 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:22.00 1st Qu.:3.000 1st Qu.:1.000
## Median :4.000 Median :30.00 Median :4.000 Median :1.000
## Mean :4.908 Mean :33.21 Mean :3.991 Mean :1.458
## 3rd Qu.:6.000 3rd Qu.:43.00 3rd Qu.:5.000 3rd Qu.:2.000
## Max. :9.000 Max. :79.00 Max. :6.000 Max. :3.000
## NA's :123
## pupil.i hypoxia hypotens ctclass
## Min. :1.000 Min. :0.0000 Min. :0.0000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:2.000
## Median :1.000 Median :0.0000 Median :0.0000 Median :3.000
## Mean :1.462 Mean :0.2167 Mean :0.1795 Mean :3.281
## 3rd Qu.:2.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:5.000
## Max. :3.000 Max. :1.0000 Max. :1.0000 Max. :6.000
## NA's :249 NA's :59 NA's :25
## tsah edh cisterns shift
## Min. :0.0000 Min. :0.0000 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :1.000 Median :0.0000
## Mean :0.4779 Mean :0.1268 Mean :1.455 Mean :0.1987
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :3.000 Max. :1.0000
## NA's :96 NA's :38 NA's :257 NA's :252
## d.sysbpt glucose glucoset ph
## Min. : 60.0 Min. : 0.5556 Min. : 3.000 Min. :6.790
## 1st Qu.:121.1 1st Qu.: 6.7000 1st Qu.: 6.700 1st Qu.:7.320
## Median :131.3 Median : 8.2000 Median : 8.200 Median :7.380
## Mean :131.4 Mean : 8.9078 Mean : 8.863 Mean :7.376
## 3rd Qu.:142.1 3rd Qu.:10.4000 3rd Qu.:10.400 3rd Qu.:7.450
## Max. :207.4 Max. :41.4000 Max. :20.000 Max. :7.660
## NA's :64 NA's :64 NA's :457
## sodium sodiumt hb hbt
## Min. :114.0 Min. :125.0 Min. : 2.50 Min. : 6.00
## 1st Qu.:137.0 1st Qu.:137.0 1st Qu.:10.90 1st Qu.:10.90
## Median :140.0 Median :140.0 Median :12.90 Median :12.90
## Mean :139.3 Mean :139.3 Mean :12.45 Mean :12.47
## 3rd Qu.:142.0 3rd Qu.:142.0 3rd Qu.:14.30 3rd Qu.:14.30
## Max. :154.0 Max. :154.0 Max. :18.60 Max. :17.00
## NA's :62 NA's :62 NA's :24 NA's :24
# Describe every variable of the raw dataset; the recorded output below lists
# the n, missing and distinct counts (plus distribution details) per variable.
describe(TBI)## TBI
##
## 24 Variables 2159 Observations
## --------------------------------------------------------------------------------
## trial
## n missing distinct
## 2159 0 2
##
## Value 74 75
## Frequency 1118 1041
## Proportion 0.518 0.482
## --------------------------------------------------------------------------------
## d.gos
## n missing distinct Info Mean Gmd
## 2159 0 5 0.894 3.543 1.726
##
## lowest : 1 2 3 4 5, highest: 1 2 3 4 5
##
## Value 1 2 3 4 5
## Frequency 503 86 262 351 957
## Proportion 0.233 0.040 0.121 0.163 0.443
## --------------------------------------------------------------------------------
## d.mort
## n missing distinct Info Sum Mean Gmd
## 2159 0 2 0.536 503 0.233 0.3576
##
## --------------------------------------------------------------------------------
## d.unfav
## n missing distinct Info Sum Mean Gmd
## 2159 0 2 0.716 851 0.3942 0.4778
##
## --------------------------------------------------------------------------------
## cause
## n missing distinct Info Mean Gmd
## 2159 0 5 0.921 4.908 2.301
##
## lowest : 3 4 5 6 9, highest: 3 4 5 6 9
##
## Value 3 4 5 6 9
## Frequency 848 420 134 370 387
## Proportion 0.393 0.195 0.062 0.171 0.179
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 64 0.999 33.21 15.17 17 18
## .25 .50 .75 .90 .95
## 22 30 43 54 59
##
## lowest : 14 15 16 17 18, highest: 74 75 76 77 79
## --------------------------------------------------------------------------------
## d.motor
## n missing distinct Info Mean Gmd
## 2159 0 6 0.919 3.991 1.226
##
## lowest : 1 2 3 4 5, highest: 2 3 4 5 6
##
## Value 1 2 3 4 5 6
## Frequency 14 279 369 627 790 80
## Proportion 0.006 0.129 0.171 0.290 0.366 0.037
## --------------------------------------------------------------------------------
## d.pupil
## n missing distinct Info Mean Gmd
## 2036 123 3 0.647 1.458 0.6881
##
## Value 1 2 3
## Frequency 1430 279 327
## Proportion 0.702 0.137 0.161
## --------------------------------------------------------------------------------
## pupil.i
## n missing distinct Info Mean Gmd
## 2159 0 3 0.65 1.462 0.6915
##
## Value 1 2 3
## Frequency 1511 299 349
## Proportion 0.700 0.138 0.162
## --------------------------------------------------------------------------------
## hypoxia
## n missing distinct Info Sum Mean Gmd
## 1910 249 2 0.509 414 0.2168 0.3397
##
## --------------------------------------------------------------------------------
## hypotens
## n missing distinct Info Sum Mean Gmd
## 2100 59 2 0.442 377 0.1795 0.2947
##
## --------------------------------------------------------------------------------
## ctclass
## n missing distinct Info Mean Gmd
## 2134 25 6 0.928 3.281 1.689
##
## lowest : 1 2 3 4 5, highest: 2 3 4 5 6
##
## Value 1 2 3 4 5 6
## Frequency 149 781 414 85 521 184
## Proportion 0.070 0.366 0.194 0.040 0.244 0.086
## --------------------------------------------------------------------------------
## tsah
## n missing distinct Info Sum Mean Gmd
## 2063 96 2 0.749 986 0.4779 0.4993
##
## --------------------------------------------------------------------------------
## edh
## n missing distinct Info Sum Mean Gmd
## 2121 38 2 0.332 269 0.1268 0.2216
##
## --------------------------------------------------------------------------------
## cisterns
## n missing distinct Info Mean Gmd
## 1902 257 3 0.716 1.455 0.6367
##
## Value 1 2 3
## Frequency 1223 492 187
## Proportion 0.643 0.259 0.098
## --------------------------------------------------------------------------------
## shift
## n missing distinct Info Sum Mean Gmd
## 1907 252 2 0.478 379 0.1987 0.3187
##
## --------------------------------------------------------------------------------
## d.sysbpt
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 1566 1 131.4 17.61 107.0 112.2
## .25 .50 .75 .90 .95
## 121.1 131.3 142.1 151.1 156.2
##
## lowest : 60.00 64.19 65.60 71.33 73.71, highest: 182.92 184.16 184.70 188.29 207.40
## --------------------------------------------------------------------------------
## glucose
## n missing distinct Info Mean Gmd .05 .10
## 2095 64 431 1 8.908 3.56 5.000 5.633
## .25 .50 .75 .90 .95
## 6.700 8.200 10.400 13.056 15.350
##
## lowest : 0.5555556 1.0000000 1.0300000 1.0400000 1.2200000
## highest: 29.2222222 30.8888889 31.1111111 32.5555556 41.4000000
## --------------------------------------------------------------------------------
## glucoset
## n missing distinct Info Mean Gmd .05 .10
## 2095 64 381 1 8.863 3.405 5.000 5.633
## .25 .50 .75 .90 .95
## 6.700 8.200 10.400 13.056 15.350
##
## lowest : 3.000000 3.030000 3.055556 3.200000 3.444444
## highest: 19.500000 19.555556 19.777778 19.800000 20.000000
## --------------------------------------------------------------------------------
## ph
## n missing distinct Info Mean Gmd .05 .10
## 1702 457 112 0.999 7.376 0.1183 7.19 7.24
## .25 .50 .75 .90 .95
## 7.32 7.38 7.45 7.50 7.54
##
## lowest : 6.79 6.83 6.87 6.90 6.91, highest: 7.62 7.63 7.64 7.65 7.66
## --------------------------------------------------------------------------------
## sodium
## n missing distinct Info Mean Gmd .05 .10
## 2097 62 62 0.993 139.3 4.448 132 134
## .25 .50 .75 .90 .95
## 137 140 142 144 145
##
## lowest : 114.0 117.0 118.0 121.0 123.0, highest: 150.0 151.0 151.7 153.0 154.0
## --------------------------------------------------------------------------------
## sodiumt
## n missing distinct Info Mean Gmd .05 .10
## 2097 62 55 0.993 139.3 4.408 132 134
## .25 .50 .75 .90 .95
## 137 140 142 144 145
##
## lowest : 125.0 126.0 126.5 127.0 127.4, highest: 150.0 151.0 151.7 153.0 154.0
## --------------------------------------------------------------------------------
## hb
## n missing distinct Info Mean Gmd .05 .10
## 2135 24 156 1 12.45 2.846 7.7 8.8
## .25 .50 .75 .90 .95
## 10.9 12.9 14.3 15.3 15.9
##
## lowest : 2.5 2.8 3.4 3.6 4.0, highest: 17.3 17.4 17.6 17.7 18.6
## --------------------------------------------------------------------------------
## hbt
## n missing distinct Info Mean Gmd .05 .10
## 2135 24 130 1 12.47 2.809 7.7 8.8
## .25 .50 .75 .90 .95
## 10.9 12.9 14.3 15.3 15.9
##
## lowest : 6.0 6.1 6.2 6.3 6.4, highest: 16.6 16.7 16.8 16.9 17.0
## --------------------------------------------------------------------------------
##################################
# Performing re-categorization and re-grouping
# of factor variables
##################################
# pupil: copy of d.pupil in which code 9 is turned into a missing value.
TBI[["pupil"]] <- ifelse(TBI[["d.pupil"]] == 9, NA, TBI[["d.pupil"]])

# motor: copy of d.motor in which missing values are turned into code 9.
TBI[["motor"]] <- ifelse(is.na(TBI[["d.motor"]]), 9, TBI[["d.motor"]])

# 0/1 indicator columns for the motor codes (codes 5 and 6 are pooled).
TBI[["motor1"]] <- ifelse(TBI[["motor"]] == 1, 1, 0)
TBI[["motor2"]] <- ifelse(TBI[["motor"]] == 2, 1, 0)
TBI[["motor3"]] <- ifelse(TBI[["motor"]] == 3, 1, 0)
TBI[["motor4"]] <- ifelse(TBI[["motor"]] == 4, 1, 0)
# motor holds no NA at this point (NA was recoded to 9 above), so %in% gives
# the same result as the two equality tests joined with "|".
TBI[["motor56"]] <- ifelse(TBI[["motor"]] %in% c(5, 6), 1, 0)
TBI[["motor9"]] <- ifelse(TBI[["motor"]] == 9, 1, 0)

# motorG: regrouped motor score stored as a factor. Code 1 maps to 5, 2 to 4,
# 3 to 3, 4 to 2, the pooled 5/6 to 1 and code 9 stays 9; anything else is NA.
TBI[["motorG"]] <- as.factor(
  ifelse(
    TBI[["motor1"]] == 1, 5,
    ifelse(
      TBI[["motor2"]] == 1, 4,
      ifelse(
        TBI[["motor3"]] == 1, 3,
        ifelse(
          TBI[["motor4"]] == 1, 2,
          ifelse(
            TBI[["motor56"]] == 1, 1,
            ifelse(TBI[["motor9"]] == 1, 9, NA)
          )
        )
      )
    )
  )
)

# Binary outcomes derived from d.gos at increasing cut-offs; a d.gos code of 6
# is turned into a missing value in each of them.
TBI[["mort"]] <- ifelse(
  TBI[["d.gos"]] == 1, 1,
  ifelse(TBI[["d.gos"]] == 6, NA, 0)
)
TBI[["deadveg"]] <- ifelse(
  TBI[["d.gos"]] < 3, 1,
  ifelse(TBI[["d.gos"]] == 6, NA, 0)
)
TBI[["unfav"]] <- ifelse(
  TBI[["d.gos"]] < 4, 1,
  ifelse(TBI[["d.gos"]] == 6, NA, 0)
)
TBI[["good"]] <- ifelse(
  TBI[["d.gos"]] < 5, 1,
  ifelse(TBI[["d.gos"]] == 6, NA, 0)
)

# 0/1 indicator columns for the CT classes. ctclass contains missing values,
# so the "|" form is kept: it leaves NA as NA in the pooled indicators.
TBI[["ctclass2"]] <- ifelse(TBI[["ctclass"]] == 2, 1, 0)
TBI[["ctclass1"]] <- ifelse(TBI[["ctclass"]] == 1, 1, 0)
TBI[["ctclass34"]] <- ifelse(
  TBI[["ctclass"]] == 3 | TBI[["ctclass"]] == 4, 1, 0
)
TBI[["ctclass56"]] <- ifelse(
  TBI[["ctclass"]] == 5 | TBI[["ctclass"]] == 6, 1, 0
)

# ctclassr4: four-group CT class (class 2 -> 1, class 1 -> 2, 3/4 -> 3,
# 5/6 -> 4), stored as a factor.
TBI[["ctclassr4"]] <- as.factor(
  ifelse(
    TBI[["ctclass1"]] == 1, 2,
    ifelse(
      TBI[["ctclass2"]] == 1, 1,
      ifelse(
        TBI[["ctclass34"]] == 1, 3,
        ifelse(TBI[["ctclass56"]] == 1, 4, NA)
      )
    )
  )
)

# ctclassr3: three-group CT class (1/2 -> 1, 3/4 -> 2, 5/6 -> 3), stored as a
# factor.
TBI[["ctclassr3"]] <- as.factor(
  ifelse(
    TBI[["ctclass1"]] == 1 | TBI[["ctclass2"]] == 1, 1,
    ifelse(
      TBI[["ctclass34"]] == 1, 2,
      ifelse(TBI[["ctclass56"]] == 1, 3, NA)
    )
  )
)
# Keep only the columns used in the analysis, in this order.
TBI.Analysis <- TBI[, c(
  "trial", "age", "hypoxia", "hypotens",
  "cisterns", "shift", "tsah", "edh",
  "pupil", "motorG", "ctclassr3", "d.sysbpt",
  "hbt", "glucoset", "unfav", "mort"
)]

# Give the derived columns their shorter analysis names
# (motorG -> motor, ctclassr3 -> ctclass, d.sysbpt -> d.sysbp,
#  hbt -> hb, glucoset -> glucose); the other names are unchanged.
names(TBI.Analysis) <- c(
  "trial", "age", "hypoxia", "hypotens",
  "cisterns", "shift", "tsah", "edh",
  "pupil", "motor", "ctclass", "d.sysbp",
  "hb", "glucose", "unfav", "mort"
)

# Set the type of every column in one pass: measurements become numeric and
# coded variables become factors. cisterns is first collapsed to 0/1
# (1 when the original code is above 1) and then turned into a factor.
TBI.Analysis <- transform(
  TBI.Analysis,
  trial    = as.factor(trial),
  age      = as.numeric(age),
  hypoxia  = as.factor(hypoxia),
  hypotens = as.factor(hypotens),
  cisterns = as.factor(ifelse(cisterns > 1, 1, 0)),
  shift    = as.factor(shift),
  tsah     = as.factor(tsah),
  edh      = as.factor(edh),
  pupil    = as.factor(pupil),
  motor    = as.factor(motor),
  ctclass  = as.factor(ctclass),
  d.sysbp  = as.numeric(d.sysbp),
  hb       = as.numeric(hb),
  glucose  = as.numeric(glucose),
  unfav    = as.factor(unfav),
  mort     = as.factor(mort)
)

# Dimensions (rows, columns) of the analysis dataset
dim(TBI.Analysis)
## [1] 2159 16
# Per-variable description of the analysis dataset
describe(TBI.Analysis)
## TBI.Analysis
##
## 16 Variables 2159 Observations
## --------------------------------------------------------------------------------
## trial
## n missing distinct
## 2159 0 2
##
## Value 74 75
## Frequency 1118 1041
## Proportion 0.518 0.482
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 64 0.999 33.21 15.17 17 18
## .25 .50 .75 .90 .95
## 22 30 43 54 59
##
## lowest : 14 15 16 17 18, highest: 74 75 76 77 79
## --------------------------------------------------------------------------------
## hypoxia
## n missing distinct
## 1910 249 2
##
## Value 0 1
## Frequency 1496 414
## Proportion 0.783 0.217
## --------------------------------------------------------------------------------
## hypotens
## n missing distinct
## 2100 59 2
##
## Value 0 1
## Frequency 1723 377
## Proportion 0.82 0.18
## --------------------------------------------------------------------------------
## cisterns
## n missing distinct
## 1902 257 2
##
## Value 0 1
## Frequency 1223 679
## Proportion 0.643 0.357
## --------------------------------------------------------------------------------
## shift
## n missing distinct
## 1907 252 2
##
## Value 0 1
## Frequency 1528 379
## Proportion 0.801 0.199
## --------------------------------------------------------------------------------
## tsah
## n missing distinct
## 2063 96 2
##
## Value 0 1
## Frequency 1077 986
## Proportion 0.522 0.478
## --------------------------------------------------------------------------------
## edh
## n missing distinct
## 2121 38 2
##
## Value 0 1
## Frequency 1852 269
## Proportion 0.873 0.127
## --------------------------------------------------------------------------------
## pupil
## n missing distinct
## 2036 123 3
##
## Value 1 2 3
## Frequency 1430 279 327
## Proportion 0.702 0.137 0.161
## --------------------------------------------------------------------------------
## motor
## n missing distinct
## 2159 0 5
##
## lowest : 1 2 3 4 5, highest: 1 2 3 4 5
##
## Value 1 2 3 4 5
## Frequency 870 627 369 279 14
## Proportion 0.403 0.290 0.171 0.129 0.006
## --------------------------------------------------------------------------------
## ctclass
## n missing distinct
## 2134 25 3
##
## Value 1 2 3
## Frequency 930 499 705
## Proportion 0.436 0.234 0.330
## --------------------------------------------------------------------------------
## d.sysbp
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 1566 1 131.4 17.61 107.0 112.2
## .25 .50 .75 .90 .95
## 121.1 131.3 142.1 151.1 156.2
##
## lowest : 60.00 64.19 65.60 71.33 73.71, highest: 182.92 184.16 184.70 188.29 207.40
## --------------------------------------------------------------------------------
## hb
## n missing distinct Info Mean Gmd .05 .10
## 2135 24 130 1 12.47 2.809 7.7 8.8
## .25 .50 .75 .90 .95
## 10.9 12.9 14.3 15.3 15.9
##
## lowest : 6.0 6.1 6.2 6.3 6.4, highest: 16.6 16.7 16.8 16.9 17.0
## --------------------------------------------------------------------------------
## glucose
## n missing distinct Info Mean Gmd .05 .10
## 2095 64 381 1 8.863 3.405 5.000 5.633
## .25 .50 .75 .90 .95
## 6.700 8.200 10.400 13.056 15.350
##
## lowest : 3.000000 3.030000 3.055556 3.200000 3.444444
## highest: 19.500000 19.555556 19.777778 19.800000 20.000000
## --------------------------------------------------------------------------------
## unfav
## n missing distinct
## 2159 0 2
##
## Value 0 1
## Frequency 1308 851
## Proportion 0.606 0.394
## --------------------------------------------------------------------------------
## mort
## n missing distinct
## 2159 0 2
##
## Value 0 1
## Frequency 1656 503
## Proportion 0.767 0.233
## --------------------------------------------------------------------------------
# Per-column summary of the analysis dataset; the recorded output below shows
# level counts for the factor columns and quantiles for the numeric columns.
summary(TBI.Analysis)## trial age hypoxia hypotens cisterns shift
## 74:1118 Min. :14.00 0 :1496 0 :1723 0 :1223 0 :1528
## 75:1041 1st Qu.:22.00 1 : 414 1 : 377 1 : 679 1 : 379
## Median :30.00 NA's: 249 NA's: 59 NA's: 257 NA's: 252
## Mean :33.21
## 3rd Qu.:43.00
## Max. :79.00
##
## tsah edh pupil motor ctclass d.sysbp
## 0 :1077 0 :1852 1 :1430 1:870 1 :930 Min. : 60.0
## 1 : 986 1 : 269 2 : 279 2:627 2 :499 1st Qu.:121.1
## NA's: 96 NA's: 38 3 : 327 3:369 3 :705 Median :131.3
## NA's: 123 4:279 NA's: 25 Mean :131.4
## 5: 14 3rd Qu.:142.1
## Max. :207.4
##
## hb glucose unfav mort
## Min. : 6.00 Min. : 3.000 0:1308 0:1656
## 1st Qu.:10.90 1st Qu.: 6.700 1: 851 1: 503
## Median :12.90 Median : 8.200
## Mean :12.47 Mean : 8.863
## 3rd Qu.:14.30 3rd Qu.:10.400
## Max. :17.00 Max. :20.000
## NA's :24 NA's :64
##################################
# Formulating a data type assessment summary
##################################
# Work on a copy of the analysis dataset for the data type assessment.
PDA <- TBI.Analysis
# One row per column: its position, its name and its (first) class.
# The outer parentheses make the assigned data frame print as well.
(PDA.Summary <- data.frame(
  # seq_along() yields an empty index for a zero-column frame, where
  # 1:length() would yield c(1, 0).
  Column.Index = seq_along(names(PDA)),
  Column.Name = names(PDA),
  # vapply() guarantees exactly one class string per column; sapply() would
  # return a list as soon as one column carries several classes.
  Column.Type = vapply(PDA, function(x) class(x)[1], character(1)),
  row.names = NULL
))
## Column.Index Column.Name Column.Type
## 1 1 trial factor
## 2 2 age numeric
## 3 3 hypoxia factor
## 4 4 hypotens factor
## 5 5 cisterns factor
## 6 6 shift factor
## 7 7 tsah factor
## 8 8 edh factor
## 9 9 pupil factor
## 10 10 motor factor
## 11 11 ctclass factor
## 12 12 d.sysbp numeric
## 13 13 hb numeric
## 14 14 glucose numeric
## 15 15 unfav factor
## 16 16 mort factor
##################################
# Copying the analysis dataset for the data quality assessment
##################################
# Work on a copy of the analysis dataset for the data quality assessment.
DQA <- TBI.Analysis
##################################
# Listing all predictors
##################################
# Every column is kept; drop = FALSE keeps the result a data frame even if
# the dataset ever has a single column.
DQA.Predictors <- DQA[, names(DQA), drop = FALSE]
##################################
# Formulating an overall data quality assessment summary
##################################
# One row per column: position, name, (first) class, number of rows, number
# of missing values and the share of non-missing values (Fill.Rate, formatted
# as text with three decimals). The outer parentheses print the result.
(DQA.Summary <- data.frame(
  # seq_along() stays empty for a zero-column frame, unlike 1:length()
  Column.Index = seq_along(names(DQA)),
  Column.Name = names(DQA),
  # vapply() fixes the result type; sapply() may silently return a list
  Column.Type = vapply(DQA, function(x) class(x)[1], character(1)),
  Row.Count = vapply(DQA, function(x) nrow(DQA), integer(1)),
  NA.Count = vapply(DQA, function(x) sum(is.na(x)), integer(1)),
  Fill.Rate = vapply(
    DQA,
    function(x) format(round(sum(!is.na(x)) / nrow(DQA), 3), nsmall = 3),
    character(1)
  ),
  row.names = NULL
))
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 trial factor 2159 0 1.000
## 2 2 age numeric 2159 0 1.000
## 3 3 hypoxia factor 2159 249 0.885
## 4 4 hypotens factor 2159 59 0.973
## 5 5 cisterns factor 2159 257 0.881
## 6 6 shift factor 2159 252 0.883
## 7 7 tsah factor 2159 96 0.956
## 8 8 edh factor 2159 38 0.982
## 9 9 pupil factor 2159 123 0.943
## 10 10 motor factor 2159 0 1.000
## 11 11 ctclass factor 2159 25 0.988
## 12 12 d.sysbp numeric 2159 0 1.000
## 13 13 hb numeric 2159 24 0.989
## 14 14 glucose numeric 2159 64 0.970
## 15 15 unfav factor 2159 0 1.000
## 16 16 mort factor 2159 0 1.000
##################################
# Listing all numeric predictors
##################################
# Keep the numeric columns only. drop = FALSE keeps a data frame even when a
# single column qualifies (otherwise the result collapses to a bare vector,
# names() becomes NULL and the count below reports zero). vapply() returns an
# empty logical vector, not a list, when there are no columns at all.
DQA.Predictors.Numeric <- DQA.Predictors[
  ,
  vapply(DQA.Predictors, is.numeric, logical(1)),
  drop = FALSE
]
# Report how many numeric columns were found.
if (length(names(DQA.Predictors.Numeric)) > 0) {
  print(paste0(
    "There are ",
    length(names(DQA.Predictors.Numeric)),
    " numeric predictor variable(s)."
  ))
} else {
  print("There are no numeric predictor variables.")
}
## [1] "There are 4 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
# Keep the factor columns only. drop = FALSE keeps a data frame even when a
# single column qualifies (otherwise the result collapses to a bare vector,
# names() becomes NULL and the count below reports zero). vapply() returns an
# empty logical vector, not a list, when there are no columns at all.
DQA.Predictors.Factor <- DQA.Predictors[
  ,
  vapply(DQA.Predictors, is.factor, logical(1)),
  drop = FALSE
]
# Report how many factor columns were found.
if (length(names(DQA.Predictors.Factor)) > 0) {
  print(paste0(
    "There are ",
    length(names(DQA.Predictors.Factor)),
    " factor predictor variable(s)."
  ))
} else {
  print("There are no factor predictor variables.")
}
## [1] "There are 12 factor predictor variable(s)."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns the most frequent non-missing value(s) of x; ties return several
  # values, which is why callers take element [1].
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }
  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent non-missing value(s) of x once every first-mode
  # value has been set aside. Missing values are dropped up front so that NA
  # can never be picked as the second mode, whatever its frequency.
  SecondModes <- function(x) {
    observed <- na.omit(x)
    ux <- unique(observed)
    tab <- tabulate(match(observed, ux))
    fm <- ux[tab == max(tab)]
    sm <- observed[!(observed %in% fm)]
    usm <- unique(sm)
    tabsm <- tabulate(match(sm, usm))
    usm[tabsm == max(tabsm)]
  }
  # One row per factor column. vapply() pins the type of every summary column
  # (sapply() can silently change it). The outer parentheses print the result.
  (DQA.Predictors.Factor.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Factor),
    Column.Type = vapply(
      DQA.Predictors.Factor,
      function(x) class(x)[1],
      character(1)
    ),
    # unique() keeps NA, so a column with missing values counts NA as one
    # additional distinct value here.
    Unique.Count = vapply(
      DQA.Predictors.Factor,
      function(x) length(unique(x)),
      integer(1)
    ),
    First.Mode.Value = vapply(
      DQA.Predictors.Factor,
      function(x) as.character(FirstModes(x)[1]),
      character(1)
    ),
    Second.Mode.Value = vapply(
      DQA.Predictors.Factor,
      function(x) as.character(SecondModes(x)[1]),
      character(1)
    ),
    First.Mode.Count = vapply(
      DQA.Predictors.Factor,
      function(x) sum(na.omit(x) == FirstModes(x)[1]),
      integer(1)
    ),
    Second.Mode.Count = vapply(
      DQA.Predictors.Factor,
      function(x) sum(na.omit(x) == SecondModes(x)[1]),
      integer(1)
    ),
    Unique.Count.Ratio = vapply(
      DQA.Predictors.Factor,
      function(x) {
        format(
          round(length(unique(x)) / nrow(DQA.Predictors.Factor), 3),
          nsmall = 3
        )
      },
      character(1)
    ),
    # Frequency of the first mode divided by frequency of the second mode,
    # stored as text with three decimals.
    First.Second.Mode.Ratio = vapply(
      DQA.Predictors.Factor,
      function(x) {
        first_count <- sum(na.omit(x) == FirstModes(x)[1])
        second_count <- sum(na.omit(x) == SecondModes(x)[1])
        format(round(first_count / second_count, 3), nsmall = 3)
      },
      character(1)
    ),
    row.names = NULL
  ))
}
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 1 trial factor 2 74 75
## 2 hypoxia factor 3 0 1
## 3 hypotens factor 3 0 1
## 4 cisterns factor 3 0 1
## 5 shift factor 3 0 1
## 6 tsah factor 3 0 1
## 7 edh factor 3 0 1
## 8 pupil factor 4 1 3
## 9 motor factor 5 1 2
## 10 ctclass factor 4 1 3
## 11 unfav factor 2 0 1
## 12 mort factor 2 0 1
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio
## 1 1118 1041 0.001
## 2 1496 414 0.001
## 3 1723 377 0.001
## 4 1223 679 0.001
## 5 1528 379 0.001
## 6 1077 986 0.001
## 7 1852 269 0.001
## 8 1430 327 0.002
## 9 870 627 0.002
## 10 930 705 0.002
## 11 1308 851 0.001
## 12 1656 503 0.001
## First.Second.Mode.Ratio
## 1 1.074
## 2 3.614
## 3 4.570
## 4 1.801
## 5 4.032
## 6 1.092
## 7 6.885
## 8 4.373
## 9 1.388
## 10 1.319
## 11 1.537
## 12 3.292
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric)) > 0) {
  ##################################
  # Formulating a function to determine the first mode
  ##################################
  # Returns the most frequent non-missing value(s) of x; ties return several
  # values, which is why callers take element [1].
  FirstModes <- function(x) {
    ux <- unique(na.omit(x))
    tab <- tabulate(match(x, ux))
    ux[tab == max(tab)]
  }
  ##################################
  # Formulating a function to determine the second mode
  ##################################
  # Returns the most frequent non-missing value(s) of x once every first-mode
  # value has been set aside; missing values are ignored.
  SecondModes <- function(x) {
    observed <- na.omit(x)
    ux <- unique(observed)
    tab <- tabulate(match(x, ux))
    fm <- ux[tab == max(tab)]
    sm <- observed[!(observed %in% fm)]
    usm <- unique(sm)
    tabsm <- tabulate(match(sm, usm))
    usm[tabsm == max(tabsm)]
  }
  # One row per numeric column. Every statistic except the two counts and
  # Unique.Count is stored as text formatted with three decimals. vapply()
  # pins the type of each summary column (sapply() can silently change it).
  # The outer parentheses print the result.
  (DQA.Predictors.Numeric.Summary <- data.frame(
    Column.Name = names(DQA.Predictors.Numeric),
    Column.Type = vapply(
      DQA.Predictors.Numeric,
      function(x) class(x)[1],
      character(1)
    ),
    # unique() keeps NA, so a column with missing values counts NA as one
    # additional distinct value here.
    Unique.Count = vapply(
      DQA.Predictors.Numeric,
      function(x) length(unique(x)),
      integer(1)
    ),
    Unique.Count.Ratio = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(
          round(length(unique(x)) / nrow(DQA.Predictors.Numeric), 3),
          nsmall = 3
        )
      },
      character(1)
    ),
    First.Mode.Value = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(FirstModes(x)[1], 3), nsmall = 3),
      character(1)
    ),
    Second.Mode.Value = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(SecondModes(x)[1], 3), nsmall = 3),
      character(1)
    ),
    First.Mode.Count = vapply(
      DQA.Predictors.Numeric,
      function(x) sum(na.omit(x) == FirstModes(x)[1]),
      integer(1)
    ),
    Second.Mode.Count = vapply(
      DQA.Predictors.Numeric,
      function(x) sum(na.omit(x) == SecondModes(x)[1]),
      integer(1)
    ),
    # Frequency of the first mode divided by frequency of the second mode
    First.Second.Mode.Ratio = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        first_count <- sum(na.omit(x) == FirstModes(x)[1])
        second_count <- sum(na.omit(x) == SecondModes(x)[1])
        format(round(first_count / second_count, 3), nsmall = 3)
      },
      character(1)
    ),
    Minimum = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(min(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Mean = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(mean(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Median = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(median(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Maximum = vapply(
      DQA.Predictors.Numeric,
      function(x) format(round(max(x, na.rm = TRUE), 3), nsmall = 3),
      character(1)
    ),
    Skewness = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(round(moments::skewness(x, na.rm = TRUE), 3), nsmall = 3)
      },
      character(1)
    ),
    Kurtosis = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(round(moments::kurtosis(x, na.rm = TRUE), 3), nsmall = 3)
      },
      character(1)
    ),
    Percentile25th = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(round(quantile(x, probs = 0.25, na.rm = TRUE), 3), nsmall = 3)
      },
      character(1)
    ),
    Percentile75th = vapply(
      DQA.Predictors.Numeric,
      function(x) {
        format(round(quantile(x, probs = 0.75, na.rm = TRUE), 3), nsmall = 3)
      },
      character(1)
    ),
    row.names = NULL
  ))
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 age numeric 64 0.030 21.000
## 2 d.sysbp numeric 1566 0.725 130.000
## 3 hb numeric 131 0.061 13.900
## 4 glucose numeric 382 0.177 20.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 19.000 104 100 1.040
## 2 120.000 9 8 1.125
## 3 13.600 47 43 1.093
## 4 6.500 29 27 1.074
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 1 14.000 33.211 30.000 79.000 0.721 2.585 22.000
## 2 60.000 131.430 131.280 207.400 -0.022 3.982 121.085
## 3 6.000 12.468 12.900 17.000 -0.619 2.786 10.900
## 4 3.000 8.863 8.200 20.000 1.192 4.727 6.700
## Percentile75th
## 1 43.000
## 2 142.060
## 3 14.300
## 4 10.400
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
# Report the columns that contain missing values, using the NA.Count column
# of the overall summary: print how many there are, then show their rows.
if (any(DQA.Summary$NA.Count > 0)) {
  print(paste0(
    "Missing observations noted for ",
    sum(DQA.Summary$NA.Count > 0),
    " variable(s) with NA.Count>0 and Fill.Rate<1.0."
  ))
  DQA.Summary[DQA.Summary$NA.Count > 0, ]
} else {
  print("No missing observations noted.")
}
## [1] "Missing observations noted for 10 variable(s) with NA.Count>0 and Fill.Rate<1.0."
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 3 3 hypoxia factor 2159 249 0.885
## 4 4 hypotens factor 2159 59 0.973
## 5 5 cisterns factor 2159 257 0.881
## 6 6 shift factor 2159 252 0.883
## 7 7 tsah factor 2159 96 0.956
## 8 8 edh factor 2159 38 0.982
## 9 9 pupil factor 2159 123 0.943
## 11 11 ctclass factor 2159 25 0.988
## 13 13 hb numeric 2159 24 0.989
## 14 14 glucose numeric 2159 64 0.970
##################################
# Checking for zero or near-zero variance predictors
##################################
# Flag factor columns whose most frequent level is more than five times as
# frequent as the runner-up. The ratio is stored as text in the summary, so
# it is converted back to numbers once and the matching rows are selected
# once; local() keeps these temporaries out of the global environment.
local({
  if (length(names(DQA.Predictors.Factor)) == 0) {
    print("No factor predictors noted.")
  } else {
    mode_ratio <- as.numeric(as.character(
      DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio
    ))
    low_variance <- DQA.Predictors.Factor.Summary[mode_ratio > 5, ]
    if (nrow(low_variance) > 0) {
      print(paste0(
        "Low variance observed for ",
        nrow(low_variance),
        " factor variable(s) with First.Second.Mode.Ratio>5."
      ))
      low_variance
    } else {
      print("No low variance factor predictors due to high first-second mode ratio noted.")
    }
  }
})
## [1] "Low variance observed for 1 factor variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 7 edh factor 3 0 1
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio First.Second.Mode.Ratio
## 7 1852 269 0.001 6.885
# Flag numeric columns whose most frequent value is more than five times as
# frequent as the runner-up. The ratio is stored as text in the summary, so
# it is converted back to numbers once and the matching rows are selected
# once; local() keeps these temporaries out of the global environment.
local({
  if (length(names(DQA.Predictors.Numeric)) == 0) {
    print("No numeric predictors noted.")
  } else {
    mode_ratio <- as.numeric(as.character(
      DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio
    ))
    low_variance <- DQA.Predictors.Numeric.Summary[mode_ratio > 5, ]
    if (nrow(low_variance) > 0) {
      print(paste0(
        "Low variance observed for ",
        nrow(low_variance),
        " numeric variable(s) with First.Second.Mode.Ratio>5."
      ))
      low_variance
    } else {
      print("No low variance numeric predictors due to high first-second mode ratio noted.")
    }
  }
})
## [1] "No low variance numeric predictors due to high first-second mode ratio noted."
# Flag numeric columns whose share of distinct values is below 1% of the row
# count. The ratio is stored as text in the summary, so it is converted back
# to numbers once and the matching rows are selected once; local() keeps
# these temporaries out of the global environment.
local({
  if (length(names(DQA.Predictors.Numeric)) == 0) {
    print("No numeric predictors noted.")
  } else {
    unique_ratio <- as.numeric(as.character(
      DQA.Predictors.Numeric.Summary$Unique.Count.Ratio
    ))
    low_variance <- DQA.Predictors.Numeric.Summary[unique_ratio < 0.01, ]
    if (nrow(low_variance) > 0) {
      print(paste0(
        "Low variance observed for ",
        nrow(low_variance),
        " numeric variable(s) with Unique.Count.Ratio<0.01."
      ))
      low_variance
    } else {
      print("No low variance numeric predictors due to low unique count ratio noted.")
    }
  }
})
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
# Flag numeric columns whose skewness lies outside [-3, 3]. Skewness is
# stored as text in the summary, so it is converted back to numbers once and
# the matching rows are selected once; local() keeps these temporaries out of
# the global environment.
local({
  if (length(names(DQA.Predictors.Numeric)) == 0) {
    print("No numeric predictors noted.")
  } else {
    skew <- as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))
    # abs(skew) > 3 covers both tails: skew > 3 or skew < -3
    skewed <- DQA.Predictors.Numeric.Summary[abs(skew) > 3, ]
    if (nrow(skewed) > 0) {
      print(paste0(
        "High skewness observed for ",
        nrow(skewed),
        " numeric variable(s) with Skewness>3 or Skewness<(-3)."
      ))
      skewed
    } else {
      print("No skewed numeric predictors noted.")
    }
  }
})
## [1] "No skewed numeric predictors noted."
##################################
# Copying the analysis dataset for the missing data pattern analysis
##################################
# Work on a copy of the analysis dataset for the missing data pattern analysis.
MDPA <- TBI.Analysis
##################################
# Reconstructing the dataset by :
# removing variables with NA ratio > 0.50
# removing observations with NA ratio > 0.80
# converting factor variables to numeric variables
##################################
# NOTE(review): both thresholds are understood to be fractions of MISSING
# data (not fill rates), per the missCompare::clean() documentation - confirm
# against the installed package version.
MDPA.Cleaned <- missCompare::clean(
  MDPA,
  var_removal_threshold = 0.50,
  ind_removal_threshold = 0.80
)
##################################
# Gathering the metadata using the reconstructed dataset
##################################
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
MDPA.Cleaned.Metadata <- missCompare::get_data(
  MDPA.Cleaned,
  matrixplot_sort = TRUE,
  plot_transform = TRUE
)
##################################
# Gathering the number of complete cases
##################################
# Complete_cases element of the metadata (recorded value: 1431)
MDPA.Cleaned.Metadata$Complete_cases## [1] 1431
# Rows element of the metadata (recorded value: 2159)
MDPA.Cleaned.Metadata$Rows## [1] 2159
# Columns element of the metadata (recorded value: 16)
MDPA.Cleaned.Metadata$Columns## [1] 16
# Share of complete cases: Complete_cases divided by Rows
MDPA.Cleaned.Metadata$Complete_cases/MDPA.Cleaned.Metadata$Rows## [1] 0.6628069
##################################
# Gathering the number of incomplete cases
##################################
# Total_NA element of the metadata (recorded value: 1187)
MDPA.Cleaned.Metadata$Total_NA## [1] 1187
# NA_per_variable element: one count per column, listed in the output below
MDPA.Cleaned.Metadata$NA_per_variable## trial age hypoxia hypotens cisterns shift tsah edh
## 0 0 249 59 257 252 96 38
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 123 0 25 0 24 64 0 0
# Fraction_missingness element of the metadata (recorded value: 0.03436197)
MDPA.Cleaned.Metadata$Fraction_missingness## [1] 0.03436197
# Fraction_missingness_per_variable element: one fraction per column, listed
# in the output below
MDPA.Cleaned.Metadata$Fraction_missingness_per_variable## trial age hypoxia hypotens cisterns shift tsah
## 0.00000000 0.00000000 0.11533117 0.02732747 0.11903659 0.11672070 0.04446503
## edh pupil motor ctclass d.sysbp hb glucose
## 0.01760074 0.05697082 0.00000000 0.01157943 0.00000000 0.01111626 0.02964335
## unfav mort
## 0.00000000 0.00000000
##################################
# Gathering the missing data patterns
##################################
# Each metadata element is shown by its own statement. Written as a single
# chained `$` expression they would look up elements that do not exist and
# evaluate to NULL.
# Matrix_plot element (by its name, the missingness matrix plot)
MDPA.Cleaned.Metadata$Matrix_plot
# Cluster_plot element (by its name, the missingness cluster plot)
MDPA.Cleaned.Metadata$Cluster_plot
# Corr_matrix element: variable-by-variable matrix, recorded in the output
# that follows
MDPA.Cleaned.Metadata$Corr_matrix
## trial age hypoxia hypotens cisterns
## trial 1.000000000 -0.030651416 1.656750e-01 0.1050912318 0.0090874012
## age -0.030651416 1.000000000 1.031147e-03 0.0797683643 0.0513284240
## hypoxia 0.165675039 0.001031147 1.000000e+00 0.2323622569 0.0460174467
## hypotens 0.105091232 0.079768364 2.323623e-01 1.0000000000 0.0004308286
## cisterns 0.009087401 0.051328424 4.601745e-02 0.0004308286 1.0000000000
## shift -0.065358546 0.104812735 4.487131e-20 -0.0549581210 0.4585598379
## tsah -0.097721950 0.174014741 1.627862e-02 -0.0075673746 0.0329649255
## edh -0.119425686 0.002670862 -4.682238e-02 -0.0770647036 0.0237954129
## pupil 0.088381902 0.016622292 1.438070e-01 0.1137020246 0.1464712963
## motor -0.051699973 -0.008694701 1.885604e-01 0.0656417934 0.1168144834
## ctclass -0.015770576 0.153012007 1.850571e-02 -0.0194182715 0.2788307306
## d.sysbp 0.087643656 0.012554957 -1.094494e-01 -0.2634324171 -0.0114882127
## hb 0.227845969 -0.031832004 -5.386741e-02 -0.2705358063 -0.0417687110
## glucose 0.090165100 0.069076332 1.619970e-01 0.1994708499 0.0498702817
## unfav -0.029068950 0.208375143 1.742552e-01 0.1776370570 0.1396093897
## mort -0.038439923 0.153665440 1.414791e-01 0.1666659119 0.1424181948
## shift tsah edh pupil motor
## trial -6.535855e-02 -0.097721950 -0.119425686 0.08838190 -0.051699973
## age 1.048127e-01 0.174014741 0.002670862 0.01662229 -0.008694701
## hypoxia 4.487131e-20 0.016278621 -0.046822378 0.14380701 0.188560393
## hypotens -5.495812e-02 -0.007567375 -0.077064704 0.11370202 0.065641793
## cisterns 4.585598e-01 0.032964926 0.023795413 0.14647130 0.116814483
## shift 1.000000e+00 0.090218463 0.041278708 0.09772835 0.088389352
## tsah 9.021846e-02 1.000000000 -0.010786454 0.14788861 0.116778650
## edh 4.127871e-02 -0.010786454 1.000000000 -0.02558442 -0.046758976
## pupil 9.772835e-02 0.147888607 -0.025584420 1.00000000 0.381055618
## motor 8.838935e-02 0.116778650 -0.046758976 0.38105562 1.000000000
## ctclass 2.793684e-01 0.191225179 0.296787667 0.21786366 0.167952008
## d.sysbp 9.615297e-03 -0.022296366 -0.015739545 -0.08779228 -0.123171876
## hb -3.378804e-02 0.030632398 -0.019653199 -0.08586573 -0.064731488
## glucose 2.573764e-02 0.102781495 -0.001029329 0.18136723 0.128402858
## unfav 1.340827e-01 0.249226529 -0.065700956 0.31905926 0.353979268
## mort 1.612055e-01 0.229655718 -0.061375715 0.25297999 0.267972519
## ctclass d.sysbp hb glucose unfav
## trial -0.015770576 0.087643656 0.227845969 0.090165100 -0.02906895
## age 0.153012007 0.012554957 -0.031832004 0.069076332 0.20837514
## hypoxia 0.018505712 -0.109449414 -0.053867408 0.161996976 0.17425522
## hypotens -0.019418271 -0.263432417 -0.270535806 0.199470850 0.17763706
## cisterns 0.278830731 -0.011488213 -0.041768711 0.049870282 0.13960939
## shift 0.279368362 0.009615297 -0.033788043 0.025737640 0.13408269
## tsah 0.191225179 -0.022296366 0.030632398 0.102781495 0.24922653
## edh 0.296787667 -0.015739545 -0.019653199 -0.001029329 -0.06570096
## pupil 0.217863665 -0.087792278 -0.085865732 0.181367225 0.31905926
## motor 0.167952008 -0.123171876 -0.064731488 0.128402858 0.35397927
## ctclass 1.000000000 -0.015114873 -0.000441974 0.123201094 0.24639544
## d.sysbp -0.015114873 1.000000000 0.195736194 -0.125362570 -0.09119451
## hb -0.000441974 0.195736194 1.000000000 -0.134613174 -0.15508193
## glucose 0.123201094 -0.125362570 -0.134613174 1.000000000 0.22878822
## unfav 0.246395438 -0.091194506 -0.155081925 0.228788217 1.00000000
## mort 0.222675184 -0.178138387 -0.162844394 0.229159186 0.68327089
## mort
## trial -0.03843992
## age 0.15366544
## hypoxia 0.14147910
## hypotens 0.16666591
## cisterns 0.14241819
## shift 0.16120554
## tsah 0.22965572
## edh -0.06137571
## pupil 0.25297999
## motor 0.26797252
## ctclass 0.22267518
## d.sysbp -0.17813839
## hb -0.16284439
## glucose 0.22915919
## unfav 0.68327089
## mort 1.00000000
# Missing data pattern table stored in the metadata
MDPA.Cleaned.Metadata[["MD_Pattern"]]
## trial age hypoxia hypotens cisterns shift tsah edh pupil motor ctclass
## 1431 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 0 1 1 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1 1
## 200 1 1 1 1 0 0 1 1 1 1 1
## 157 1 1 0 1 1 1 1 1 1 1 1
## 12 1 1 0 1 0 0 1 1 1 1 1
## 82 1 1 1 1 1 1 1 1 0 1 1
## 12 1 1 1 1 0 0 1 1 0 1 1
## 10 1 1 0 1 1 1 1 1 0 1 1
## 1 1 1 0 1 0 1 1 1 0 1 1
## 42 1 1 1 1 1 1 0 1 1 1 1
## 9 1 1 1 1 0 0 0 1 1 1 1
## 11 1 1 0 1 1 1 0 1 1 1 1
## 5 1 1 1 1 1 1 0 1 0 1 1
## 34 1 1 1 1 1 1 1 1 1 1 1
## 8 1 1 0 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 0 1 1
## 2 1 1 1 1 1 1 0 1 1 1 1
## 10 1 1 1 0 1 1 1 1 1 1 1
## 27 1 1 0 0 1 1 1 1 1 1 1
## 2 1 1 0 0 0 0 1 1 1 1 1
## 3 1 1 0 0 1 1 1 1 0 1 1
## 2 1 1 1 0 1 1 0 1 1 1 1
## 2 1 1 0 0 1 1 0 1 1 1 1
## 3 1 1 1 0 1 1 1 1 1 1 1
## 1 1 1 0 0 1 1 1 1 1 1 1
## 1 1 1 0 0 1 1 0 1 1 1 1
## 7 1 1 1 1 1 1 1 0 1 1 1
## 3 1 1 1 1 0 0 1 0 1 1 1
## 3 1 1 0 1 1 1 1 0 1 1 1
## 1 1 1 1 1 1 1 1 0 0 1 1
## 1 1 1 0 1 1 1 1 0 0 1 1
## 10 1 1 1 1 1 1 0 0 1 1 1
## 4 1 1 1 1 0 0 0 0 1 1 1
## 1 1 1 0 1 1 1 0 0 1 1 1
## 1 1 1 1 1 1 1 0 0 0 1 1
## 1 1 1 0 1 1 1 0 0 1 1 1
## 1 1 1 1 0 1 1 1 0 1 1 1
## 1 1 1 0 0 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 0 0 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 0
## 2 1 1 1 1 0 0 1 1 1 1 0
## 4 1 1 0 1 1 1 1 1 1 1 0
## 3 1 1 1 1 1 1 1 1 0 1 0
## 1 1 1 1 0 1 1 1 1 1 1 0
## 1 1 1 1 0 0 0 1 1 1 1 0
## 1 1 1 1 1 1 1 1 0 1 1 0
## 2 1 1 1 1 1 1 0 0 1 1 0
## 5 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 0 0 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 1 1
## 7 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 0 0 1 1 1 1 1
## 1 1 1 0 1 1 1 1 1 1 1 1
## 1 1 1 1 1 0 0 1 1 0 1 1
## 1 1 1 1 1 1 1 0 1 1 1 1
## 2 1 1 0 0 1 1 1 1 1 1 1
## 1 1 1 1 0 1 1 0 1 1 1 1
## 0 0 249 59 257 252 96 38 123 0 25
## d.sysbp hb glucose unfav mort
## 1431 1 1 1 1 1
## 5 1 1 1 1 1
## 1 1 1 1 1 1
## 200 1 1 1 1 1
## 157 1 1 1 1 1
## 12 1 1 1 1 1
## 82 1 1 1 1 1
## 12 1 1 1 1 1
## 10 1 1 1 1 1
## 1 1 1 1 1 1
## 42 1 1 1 1 1
## 9 1 1 1 1 1
## 11 1 1 1 1 1
## 5 1 1 1 1 1
## 34 1 1 0 1 1
## 8 1 1 0 1 1
## 2 1 1 0 1 1
## 2 1 1 0 1 1
## 10 1 1 1 1 1
## 27 1 1 1 1 1
## 2 1 1 1 1 1
## 3 1 1 1 1 1
## 2 1 1 1 1 1
## 2 1 1 1 1 1
## 3 1 1 0 1 1
## 1 1 1 0 1 1
## 1 1 1 0 1 1
## 7 1 1 1 1 1
## 3 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 10 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 0 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 11 1 1 1 1 1
## 2 1 1 1 1 1
## 4 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 0 1 1 1
## 4 1 0 1 1 1
## 1 1 0 1 1 1
## 7 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 2 1 0 1 1 1
## 1 1 0 0 1 1
## 0 24 64 0 0
# Dimensions of the missing data pattern table
dim(MDPA.Cleaned.Metadata[["MD_Pattern"]])
## [1] 59 16
# Correlations between the missingness indicators (rows named <variable>_is.na)
# and the variables
MDPA.Cleaned.Metadata[["NA_Correlations"]]
## trial age hypoxia hypotens
## trial_is.na NaN NaN NaN NaN
## age_is.na NaN NaN NaN NaN
## hypoxia_is.na 0.003075342 -1.991261e-02 NaN -0.050871031
## hypotens_is.na -0.003139022 9.723747e-03 -0.070715627 NaN
## cisterns_is.na -0.020274316 5.403308e-03 -0.036463860 0.021305289
## shift_is.na -0.012972969 2.893106e-03 -0.040060700 0.018044005
## tsah_is.na -0.048171238 1.924787e-02 -0.067614722 -0.098680649
## edh_is.na 0.023420299 5.196470e-03 -0.043045790 -0.065097712
## pupil_is.na 0.037218347 -4.342998e-02 -0.030756342 -0.018479920
## motor_is.na NaN NaN NaN NaN
## ctclass_is.na -0.008194874 -8.522036e-03 -0.005461126 -0.046146806
## d.sysbp_is.na NaN NaN NaN NaN
## hb_is.na -0.039147773 2.245985e-05 -0.029832037 -0.002868117
## glucose_is.na 0.092140222 1.110012e-03 -0.005690478 0.003122214
## unfav_is.na NaN NaN NaN NaN
## mort_is.na NaN NaN NaN NaN
## cisterns shift tsah edh
## trial_is.na NaN NaN NaN NaN
## age_is.na NaN NaN NaN NaN
## hypoxia_is.na 0.005132467 -0.037162241 -8.090368e-03 0.0209094526
## hypotens_is.na 0.019420226 -0.014558767 -9.092611e-04 0.0009040846
## cisterns_is.na NaN 0.027979664 5.895326e-02 -0.0364386527
## shift_is.na -0.030781323 NaN 4.920710e-02 -0.0351378312
## tsah_is.na 0.035616071 -0.016129896 NaN 0.0201143139
## edh_is.na 0.017911716 0.012064215 3.759542e-02 NaN
## pupil_is.na 0.032634364 -0.017689375 -2.970642e-02 0.0320063995
## motor_is.na NaN NaN NaN NaN
## ctclass_is.na 0.008762896 0.004581525 -6.719949e-05 0.0390176196
## d.sysbp_is.na NaN NaN NaN NaN
## hb_is.na -0.029175385 0.007846099 4.863460e-03 -0.0128075625
## glucose_is.na -0.042431688 -0.027251502 -1.925087e-02 -0.0167735791
## unfav_is.na NaN NaN NaN NaN
## mort_is.na NaN NaN NaN NaN
## pupil motor ctclass d.sysbp hb
## trial_is.na NaN NaN NaN NaN NaN
## age_is.na NaN NaN NaN NaN NaN
## hypoxia_is.na -0.0383020501 -0.077961939 -0.018323675 0.06582073 0.02059243
## hypotens_is.na -0.0252221782 -0.029960226 -0.013411691 0.02261856 -0.02056744
## cisterns_is.na 0.0268112302 0.111893054 -0.006296335 -0.06654302 0.11436064
## shift_is.na 0.0192540664 0.105806117 -0.017223821 -0.06403836 0.12099795
## tsah_is.na -0.0308960246 -0.041118410 0.002862034 0.07287370 0.06924227
## edh_is.na -0.0298496078 -0.023869160 0.018298843 0.04278551 0.06563305
## pupil_is.na NaN -0.028658751 -0.020254575 0.02318343 0.03887609
## motor_is.na NaN NaN NaN NaN NaN
## ctclass_is.na -0.0750312122 -0.023662963 NaN 0.01626310 0.06052439
## d.sysbp_is.na NaN NaN NaN NaN NaN
## hb_is.na 0.0005132771 -0.003710246 -0.012940904 0.02107206 NaN
## glucose_is.na 0.0150930949 -0.020565988 -0.049791976 0.04466416 0.01725871
## unfav_is.na NaN NaN NaN NaN NaN
## mort_is.na NaN NaN NaN NaN NaN
## glucose unfav mort
## trial_is.na NaN NaN NaN
## age_is.na NaN NaN NaN
## hypoxia_is.na -0.004860708 -0.002531656 -0.02054161
## hypotens_is.na 0.011223556 0.013113682 0.01173212
## cisterns_is.na -0.046590315 0.033074703 0.03341347
## shift_is.na -0.045520597 0.024587698 0.02631154
## tsah_is.na -0.088678630 -0.055916708 -0.04589547
## edh_is.na -0.036218129 -0.021781268 -0.02622100
## pupil_is.na 0.010353762 -0.018474498 -0.02998735
## motor_is.na NaN NaN NaN
## ctclass_is.na -0.074101567 -0.063310212 -0.07348964
## d.sysbp_is.na NaN NaN NaN
## hb_is.na -0.008761202 -0.022962768 -0.03562051
## glucose_is.na NaN -0.015499980 -0.01349823
## unfav_is.na NaN NaN NaN
## mort_is.na NaN NaN NaN
# Plot of the missingness correlations, then the share of observations
# retained at each minimum pattern-size threshold (one statement per line;
# fused together they form a single invalid element lookup)
MDPA.Cleaned.Metadata$NA_Correlation_plot
MDPA.Cleaned.Metadata$min_PDM_thresholds
## min_PDM_threshold perc_obs_retained
## 1 5 89
## 2 10 81
## 3 20 74
## 4 50 60
## 5 100 49
## 6 200 0
## 7 500 0
## 8 1000 0
MDPA.Cleaned.Metadata$Vars_above_half
## character(0)
##################################
# Loading dataset
##################################
MDPA <- TBI.Analysis
##################################
# Regression-based test of the missingness mechanism :
# MCAR (Missing Completely at Random) versus MAR (Missing at Random),
# evaluating each variable with missing data against the variables
# with complete observations
# NOTE(review): in the $type output, -1 appears to mark complete variables,
# 0 MCAR and 1 MAR - confirm against the RBtest documentation
##################################
RBtest::RBtest(TBI.Analysis)
## $abs.nbrMD
## trial age hypoxia hypotens cisterns shift tsah edh
## 0 0 249 59 257 252 96 38
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 123 0 25 0 24 64 0 0
##
## $rel.nbrMD
## trial age hypoxia hypotens cisterns shift tsah edh
## 0.00 0.00 0.12 0.03 0.12 0.12 0.04 0.02
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 0.06 0.00 0.01 0.00 0.01 0.03 0.00 0.00
##
## $type
## trial age hypoxia hypotens cisterns shift tsah edh
## -1 -1 1 0 1 0 0 0
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 0 -1 1 -1 0 0 -1 -1
ways of interpreting PCA results are thus not applicable to NLPCA since the loadings are hidden in the network. However, the scores of components that lead to low cross-validation errors can still be interpreted via the score plot. | | Multivariate Imputation by Chained Equations as an imputation method is based on fully conditional specification, where each incomplete variable is imputed by a separate model. As a sequential regression imputation technique, the algorithm imputes an incomplete column (target column) by generating plausible synthetic values given other columns in the data. Each incomplete column must act as a target column, and has its own specific set of predictors. For predictors that are incomplete themselves, the most recently generated imputations are used to complete the predictors prior to imputation of the target columns. | | Missing data imputation method evaluation: | | [A] The performance of 16 missing data imputation methods was evaluated on the simulated dataset using the impute_simulated method from the missCompare package. | [A.1] Method 1: Random replacement | [A.2] Method 2: Median imputation | [A.3] Method 3: Mean imputation | [A.4] Method 4: missMDA Regularized | [A.5] Method 5: missMDA EM | [A.6] Method 6: pcaMethods PPCA | [A.7] Method 7: pcaMethods svdImpute | [A.8] Method 8: pcaMethods BPCA | [A.9] Method 9: pcaMethods NIPALS | [A.10] Method 10: pcaMethods NLPCA | [A.11] Method 11: mice mixed | [A.12] Method 12: mi Bayesian | [A.13] Method 13: Amelia II | [A.14] Method 14: missForest | [A.15] Method 15: Hmisc aregImpute | [A.16] Method 16: VIM kNN | | [B] The missing data imputation methods from the missMDA and pcaMethods packages had the best MAE and RMSE performance. | | [C] The missing data imputation methods from the mice, mi, Amelia, missForest, Hmisc and VIM packages had the best KS test performance. 
| | [D] Due to the nature of the dataset containing both numeric and factor variables, the missing data imputation methods from the mice, Hmisc and VIM packages, which offer more flexibility for mixed data types, were chosen for the subsequent process. |
##################################
# Loading dataset
##################################
MDIE <- TBI.Analysis
##################################
# Reconstructing the dataset by :
# removing variables with a missing-data fraction above 0.50
# removing observations with a missing-data fraction above 0.80
# converting factor variables to numeric variables
##################################
MDIE.Cleaned <- missCompare::clean(MDIE,
                                   var_removal_threshold = 0.50,
                                   ind_removal_threshold = 0.80)
##################################
# Gathering the metadata using the reconstructed dataset
# (TRUE/FALSE are spelled out because T and F are ordinary,
# reassignable variables in R)
##################################
MDIE.Cleaned.Metadata <- missCompare::get_data(MDIE.Cleaned,
                                               matrixplot_sort = TRUE,
                                               plot_transform = TRUE)
##################################
# Simulating a dataset that mirrors the metadata characteristics :
# same number of rows and columns, same correlation matrix,
# variables generated with mean 0 and standard deviation 1
# NOTE(review): no missingness arguments are passed to this call; the
# MCAR, MAR and MNAR mechanisms presumably come into play only in the
# impute_simulated step further down - confirm
##################################
MDIE.Simulated <- missCompare::simulate(
  rownum  = MDIE.Cleaned.Metadata[["Rows"]],
  colnum  = MDIE.Cleaned.Metadata[["Columns"]],
  cormat  = MDIE.Cleaned.Metadata[["Corr_matrix"]],
  meanval = 0,
  sdval   = 1
)
##################################
# Performing 16 imputation methods
# on the simulated dataset
# Method 1 : random replacement
# Method 2 : median imputation
# Method 3 : mean imputation
# Method 4 : missMDA Regularized
# Method 5 : missMDA EM
# Method 6 : pcaMethods PPCA
# Method 7 : pcaMethods svdImpute
# Method 8 : pcaMethods BPCA
# Method 9 : pcaMethods NIPALS
# Method 10 : pcaMethods NLPCA
# Method 11 : mice mixed
# Method 12 : mi Bayesian
# Method 13 : Amelia II
# Method 14 : missForest
# Method 15 : Hmisc aregImpute
# Method 16 : VIM kNN
##################################
##################################
# Process parameters :
# min_PDM = 10 : minimum number of observations per distinct missing data pattern
# n.iter = 1 : number of iterations (for illustration only, actual number should be set high in real applications)
##################################
MDIE.Simulated.Imputed <- with(
  MDIE.Cleaned.Metadata,
  missCompare::impute_simulated(
    rownum = Rows,
    colnum = Columns,
    cormat = Corr_matrix,
    MD_pattern = MD_Pattern,
    NA_fraction = Fraction_missingness,
    min_PDM = 10,
    n.iter = 1,
    assumed_pattern = NA
  )
)
## [1] "ITERATION 1 OF TOTAL 1 - IN PROGRESS"
## [1] "random replacement imputation - in progress"
## [1] "Median imputation - in progress"
## [1] "Mean imputation - in progress"
## [1] "missMDA regularized imputation - in progress"
## [1] "missMDA EM imputation - in progress"
## [1] "pcaMethods PPCA imputation - in progress"
## [1] "pcaMethods svdImpute imputation - in progress"
## [1] "pcaMethods BPCA imputation - in progress"
## [1] "pcaMethods NIPALS imputation - in progress"
## [1] "pcaMethods NLPCA imputation - in progress"
## [1] "mice mixed imputation - in progress"
## [1] "mi imputation - in progress"
## [1] "Amelia II imputation - in progress"
## [1] "missForest imputation - in progress"
## [1] "Hmisc aregImpute imputation - in progress"
## [1] "VIM kNN imputation - in progress"
##################################
# Comparison of the 16 imputation methods
# by process execution time,
# per missing data mechanism
##################################
MDIE.Simulated.Imputed[["Plot_TIME"]]
##################################
# Comparison of the 16 imputation methods
# by root mean square error (RMSE),
# per missing data mechanism
##################################
MDIE.Simulated.Imputed[["Plot_RMSE"]]
##################################
# Comparison of the 16 imputation methods
# by mean absolute error (MAE),
# per missing data mechanism
##################################
MDIE.Simulated.Imputed[["Plot_MAE"]]
##################################
# Comparison of the 16 imputation methods
# by Kolmogorov-Smirnov statistic,
# per missing data mechanism
##################################
MDIE.Simulated.Imputed[["Plot_KS"]]
##################################
# Loading dataset
##################################
MDIPID <- TBI.Analysis
##################################
# Performing missing data imputation
# on the original dataset using
# Method 11 : mice mixed
##################################
##################################
# Process parameters defined as follows :
# Number of iterations = 3 (for illustration only, actual number should be set high in real applications)
# scale is written as FALSE rather than F, since F is a reassignable variable
##################################
MDIPID.Imputed.MiceMixed <- missCompare::impute_data(MDIPID,
                                                     scale = FALSE,
                                                     n.iter = 3,
                                                     sel_method = c(11))
## [1] "mice mixed imputation - in progress"
##################################
# Performing post-imputation diagnostics
# using Method 11 : mice mixed
##################################
##################################
# Process parameters defined as follows :
# Number of bootstrap cycles = 5 (for illustration only, actual number should be set high in real applications)
# Only the first element of the imputation list ([[1]]) is diagnosed
##################################
MDIPID.Imputed.MiceMixed.Diagnostics <- missCompare::post_imp_diag(
  MDIPID,
  MDIPID.Imputed.MiceMixed$mice_mixed_imputation[[1]],
  scale = FALSE,
  n.boot = 5
)
##################################
# Method 11 : mice mixed
# Histograms of the numeric variables,
# imputed versus original values
##################################
MDIPID.Imputed.MiceMixed.Diagnostics[["Histograms"]]
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Method 11 : mice mixed
# Box plots of the numeric variables,
# imputed versus original values
##################################
MDIPID.Imputed.MiceMixed.Diagnostics[["Boxplots"]]
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Method 11 : mice mixed
# Summary statistics of the numeric variables,
# imputed versus original values
##################################
MDIPID.Imputed.MiceMixed.Diagnostics[["Statistics"]]
## $age
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 33.21121 13.57760 NA NA NA
## KS_test
## NA
##
## $d.sysbp
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 131.42962 15.85943 NA NA NA
## KS_test
## NA
##
## $hb
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 12.4676112 2.5033293 12.5041667 2.6164163 0.9462852
## KS_test.D
## 0.1099532
##
## $glucose
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 8.8632612 3.1907564 9.3703125 3.3678757 0.2389571
## KS_test.D
## 0.1293034
##################################
# Method 11 : mice mixed
# Correlation plot comparing
# the imputed and original datasets
##################################
MDIPID.Imputed.MiceMixed.Diagnostics[["Correlation_plot"]]
##################################
# Method 11 : mice mixed
# Bar charts of the non-numeric variables
# (trial, hypoxia, ..., mort),
# imputed versus original values
##################################
MDIPID.Imputed.MiceMixed.Diagnostics[["Barcharts"]]
## $trial
##
## $hypoxia
##
## $hypotens
##
## $cisterns
##
## $shift
##
## $tsah
##
## $edh
##
## $pupil
##
## $motor
##
## $ctclass
##
## $unfav
##
## $mort
##################################
# Comparing the cluster plot of the variables
# between the imputed and original datasets
# from Method 11 : mice mixed
# (one statement per line; fused together the two lookups form a
# single invalid element access and neither plot is shown)
##################################
MDIPID.Imputed.MiceMixed.Diagnostics$Variable_clusters_imp
MDIPID.Imputed.MiceMixed.Diagnostics$Variable_clusters_orig
##################################
# Loading dataset
##################################
MDIPID <- TBI.Analysis
##################################
# Performing missing data imputation
# on the original dataset using
# Method 15 : Hmisc aregImpute
##################################
##################################
# Process parameters defined as follows :
# Number of iterations = 3 (for illustration only, actual number should be set high in real applications)
# scale is written as FALSE rather than F, since F is a reassignable variable
##################################
MDIPID.Imputed.HmiscAregImpute <- missCompare::impute_data(MDIPID,
                                                           scale = FALSE,
                                                           n.iter = 3,
                                                           sel_method = c(15))
## [1] "Hmisc aregImpute imputation - in progress"
##################################
# Performing post-imputation diagnostics
# using Method 15 : Hmisc aregImpute
##################################
##################################
# Process parameters defined as follows :
# Number of bootstrap cycles = 5 (for illustration only, actual number should be set high in real applications)
# Only the first element of the imputation list ([[1]]) is diagnosed
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics <- missCompare::post_imp_diag(
  MDIPID,
  MDIPID.Imputed.HmiscAregImpute$Hmisc_aregImpute_imputation[[1]],
  scale = FALSE,
  n.boot = 5
)
##################################
# Method 15 : Hmisc aregImpute
# Histograms of the numeric variables,
# imputed versus original values
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics[["Histograms"]]
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Method 15 : Hmisc aregImpute
# Box plots of the numeric variables,
# imputed versus original values
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics[["Boxplots"]]
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Method 15 : Hmisc aregImpute
# Summary statistics of the numeric variables,
# imputed versus original values
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics[["Statistics"]]
## $age
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 33.21121 13.57760 NA NA NA
## KS_test
## NA
##
## $d.sysbp
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 131.42962 15.85943 NA NA NA
## KS_test
## NA
##
## $hb
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 12.4676112 2.5033293 12.8166667 2.0202597 0.4096952
## KS_test.D
## 0.1677010
##
## $glucose
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 8.8632612 3.1907564 8.3757465 3.0807203 0.2171901
## KS_test.D
## 0.1358741
##################################
# Method 15 : Hmisc aregImpute
# Correlation plot comparing
# the imputed and original datasets
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics[["Correlation_plot"]]
##################################
# Method 15 : Hmisc aregImpute
# Bar charts of the non-numeric variables
# (trial, hypoxia, ..., mort),
# imputed versus original values
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics[["Barcharts"]]
## $trial
##
## $hypoxia
##
## $hypotens
##
## $cisterns
##
## $shift
##
## $tsah
##
## $edh
##
## $pupil
##
## $motor
##
## $ctclass
##
## $unfav
##
## $mort
##################################
# Comparing the cluster plot of the variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
# (one statement per line; fused together the two lookups form a
# single invalid element access and neither plot is shown)
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics$Variable_clusters_imp
MDIPID.Imputed.HmiscAregImpute.Diagnostics$Variable_clusters_orig
##################################
# Loading dataset
##################################
MDIPID <- TBI.Analysis
##################################
# Performing missing data imputation
# on the original dataset using
# Method 16 : VIM kNN
##################################
##################################
# Process parameters defined as follows :
# Number of iterations = 3 (for illustration only, actual number should be set high in real applications)
# scale is written as FALSE rather than F, since F is a reassignable variable
##################################
MDIPID.Imputed.VIMkNN <- missCompare::impute_data(MDIPID,
                                                  scale = FALSE,
                                                  n.iter = 3,
                                                  sel_method = c(16))
## [1] "VIM kNN imputation - in progress"
##################################
# Performing post-imputation diagnostics
# using Method 16 : VIM kNN
##################################
##################################
# Process parameters defined as follows :
# Number of bootstrap cycles = 5 (for illustration only, actual number should be set high in real applications)
# Only the first element of the imputation list ([[1]]) is diagnosed
##################################
MDIPID.Imputed.VIMkNN.Diagnostics <- missCompare::post_imp_diag(
  MDIPID,
  MDIPID.Imputed.VIMkNN$VIM_kNN_imputation[[1]],
  scale = FALSE,
  n.boot = 5
)
##################################
# Method 16 : VIM kNN
# Histograms of the numeric variables,
# imputed versus original values
##################################
MDIPID.Imputed.VIMkNN.Diagnostics[["Histograms"]]
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Method 16 : VIM kNN
# Box plots of the numeric variables,
# imputed versus original values
##################################
MDIPID.Imputed.VIMkNN.Diagnostics[["Boxplots"]]
## $age
##
## $d.sysbp
##
## $hb
##
## $glucose
##################################
# Method 16 : VIM kNN
# Summary statistics of the numeric variables,
# imputed versus original values
##################################
MDIPID.Imputed.VIMkNN.Diagnostics[["Statistics"]]
## $age
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 33.21121 13.57760 NA NA NA
## KS_test
## NA
##
## $d.sysbp
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 131.42962 15.85943 NA NA NA
## KS_test
## NA
##
## $hb
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 12.46761124 2.50332935 12.97291667 1.24542460 0.06316901
## KS_test.D
## 0.20618657
##
## $glucose
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 8.863261e+00 3.190756e+00 8.150738e+00 1.266623e+00 8.462556e-05
## KS_test.D
## 2.199806e-01
##################################
# Method 16 : VIM kNN
# Correlation plot comparing
# the imputed and original datasets
##################################
MDIPID.Imputed.VIMkNN.Diagnostics[["Correlation_plot"]]
##################################
# Method 16 : VIM kNN
# Bar charts of the non-numeric variables
# (trial, hypoxia, ..., mort),
# imputed versus original values
##################################
MDIPID.Imputed.VIMkNN.Diagnostics[["Barcharts"]]
## $trial
##
## $hypoxia
##
## $hypotens
##
## $cisterns
##
## $shift
##
## $tsah
##
## $edh
##
## $pupil
##
## $motor
##
## $ctclass
##
## $unfav
##
## $mort
##################################
# Comparing the cluster plot of the variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
# (one statement per line; fused together the two lookups form a
# single invalid element access and neither plot is shown)
##################################
MDIPID.Imputed.VIMkNN.Diagnostics$Variable_clusters_imp
MDIPID.Imputed.VIMkNN.Diagnostics$Variable_clusters_orig